library("quanteda")
library("topicmodels")
library("readtext")
library(readr)
library(tidytext)
library(dplyr)
library(ggplot2)

# Load the raw texts. The CSV is expected to contain exactly one column, with
# one document per row.
data_path <- "path/to/texts.csv" # TODO: set this to the location of the CSV file
d_data <- read_csv(data_path)
stopifnot(ncol(d_data) >= 1L)

# corpus() needs a variable called "text". Rename the first column in place
# instead of copying and deleting a column that must be called `X1`, so the
# script does not depend on how the reader auto-names the column.
names(d_data)[1] <- "text"

d_corpus <- corpus(d_data) # turn the data frame into a quanteda corpus
d_corpus <- tolower(d_corpus) # lower-case all texts

# Split the whole corpus into tokens (separate words).
d_tokens <- tokens(x = d_corpus)

# Extend the stop-word list to fit this data set: emoji, glob patterns for
# links/mentions/hashtags, calendar and number words and other filler terms,
# followed by the standard English stop words.
new_stopwords <- c(
  "👇", "🔵", "🌹", "✅", "🇬🇧", "💙", "✔", "👉", "🚨", "❌",
  "🇬🇧", "🌳", "🗳", "🗣", "📢", "📆", "⬇", "⤵",
  "minut*", "hour*", "amp", "http*", "@*", "#*", "even", "will",
  "monday", "tuesday", "wednesday", "thursday", "friday", "saturday",
  "sunday", "extra", "use", "rail", "today", "yesterday", "tomorrow",
  "now", "full", "bring", "big", "say", "meet", "across", "add", "step",
  "s", "t", "one", "two", "three", "four", "five", "six", "seven",
  "eight", "nine", "ten", "summer", "winter", "spring", "autumn", "ever",
  "pm", "am", "rt", "back", "like", "still", "see", "end", "first",
  "many", "come", "yet", "real", "find", "sure", "never", "january",
  "february", "march", "april", "may", "june", "july", "august",
  "september", "october", "november", "december", "day*", "week*",
  "month*", "always", "since", "become", "keep", "let", "can", "ahead",
  "new", "hard", "made", "ago", "just", "still",
  stopwords("english")
)

# Build the document-feature matrix from the corpus while dropping the stop
# words, punctuation and numbers; stemming is switched off.
d_dfm <- dfm(
  d_corpus,
  remove = new_stopwords,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  stem = FALSE
)

# Dictionary with five categories of glob patterns (and a few emoji):
# corona, crisis, management, blame and economy vocabulary.
COVID_dictionary <- dictionary(list(
  corona_words = c(
    "🦠", "*corona*", "*covid*", "*virus*", "*demic*"
  ),
  crisis_words = c(
    "🏴", "*wave*", "*impact*", "affect*", "*crisis*", "*crises*",
    "*criti*", "*death*", "*patient*", "*intensive*", "*plagu*"
  ),
  manage_words = c(
    "💪", "👏", "👍", "👍", "effect*", "ensur*", "constrain", "*secur*",
    "alert", "rebuild", "symptoms", "fight", "*beat*", "*war",
    "*prevail*", "nation", "*vaccin*", "*cure*", "*moral*", "mental*",
    "treated", "thought*", "*understand*", "*curing*", "*remed*",
    "*effort*", "*home*", "*manag*", "*control*", "*help*", "*thank*",
    "*clap*", "*beat*", "win"
  ),
  blame_words = c(
    "chin*", "blam*", "strateg*"
  ),
  economy_words = c(
    "firm*", "financ*", "econom*", "innovat*", "trade*", "trading"
  )
))

# Cosine similarity, computed over features, between the dfm and its "debt"
# column.
sim <- textstat_simil(
  x = d_dfm,
  y = d_dfm[, "debt"],
  margin = "features",
  method = "cosine"
)
# Print the first 18 entries of each element of the similarity list.
lapply(as.list(sim), function(scores) head(scores, 18))

# Count dictionary hits per document by applying COVID_dictionary while
# building the dfm (not tokens_lookup(), because its result is harder to count
# afterwards).
tokens_results <- dfm(
  d_tokens,
  dictionary = COVID_dictionary,
  remove = new_stopwords,
  remove_punct = TRUE
)
counts <- convert(tokens_results, to = "data.frame") # export the counts to a data frame
tokens_results

# Total number of hits per category over all documents. The columns carry the
# dictionary key names, so they are spelled out in full instead of relying on
# partial matching by `$`.
sum(counts$corona_words)
sum(counts$crisis_words)
sum(counts$manage_words)

# Pattern vectors, one per category.
corona_words <- c("🦠", "*corona*", "*covid*", "*virus*", "*demic*")
crisis_words <- c(
  "🏴", "*wave*", "*impact*", "affect*", "*crisis*", "*crises*", "*criti*",
  "*death*", "*patient*", "*intensive*", "*plagu*"
)
manage_words <- c(
  "💪", "👏", "👍", "👍", "effect*", "ensur*", "constrain", "*secur*",
  "alert", "rebuild", "symptoms", "fight", "*beat*", "*war", "*prevail*",
  "nation", "*vaccin*", "*cure*", "*moral*", "mental*", "treated",
  "thought*", "*understand*", "*curing*", "*remed*", "*effort*", "*home*",
  "*manag*", "*control*", "*help*", "*thank*", "*clap*", "*beat*", "win"
)
blame_words <- c("chin*", "blam*", "strateg*")
economy_words <- c("firm*", "financ*", "econom*", "innovat*", "trade*", "trading")

# Turn each character vector into a list with one element per pattern and
# name every element after the pattern it holds. setNames() returns the list
# with its names set, which is the same result as as.list() followed by
# `names(x) <- value`.
corona_dictionary_list <- setNames(as.list(corona_words), corona_words)
crisis_dictionary_list <- setNames(as.list(crisis_words), crisis_words)
manage_dictionary_list <- setNames(as.list(manage_words), manage_words)

# Build one quanteda dictionary per category from the named lists. Each list
# element is named after its own pattern, so every pattern becomes a separate
# dictionary key.
corona_dictionary <- dictionary(x = corona_dictionary_list)
crisis_dictionary <- dictionary(x = crisis_dictionary_list)
manage_dictionary <- dictionary(x = manage_dictionary_list)

# For each category: look the per-pattern dictionary up in d_dfm (glob
# matching, case-insensitive), then export the resulting counts to a data
# frame.

# corona
dfm_results_corona <- dfm_lookup(
  x = d_dfm,
  dictionary = corona_dictionary,
  valuetype = "glob",
  case_insensitive = TRUE
)
counts_corona <- convert(dfm_results_corona, to = "data.frame")

# crisis
dfm_results_crisis <- dfm_lookup(
  x = d_dfm,
  dictionary = crisis_dictionary,
  valuetype = "glob",
  case_insensitive = TRUE
)
counts_crisis <- convert(dfm_results_crisis, to = "data.frame")

# management
dfm_results_manage <- dfm_lookup(
  x = d_dfm,
  dictionary = manage_dictionary,
  valuetype = "glob",
  case_insensitive = TRUE
)
counts_manage <- convert(dfm_results_manage, to = "data.frame")

# For each counts table: use the doc_id values as row names, drop the doc_id
# column so only count columns remain, and print the total of every column.

# corona
rownames(counts_corona) <- counts_corona[["doc_id"]]
counts_corona[["doc_id"]] <- NULL
colSums(counts_corona)

# crisis
rownames(counts_crisis) <- counts_crisis[["doc_id"]]
counts_crisis[["doc_id"]] <- NULL
colSums(counts_crisis)

# management
rownames(counts_manage) <- counts_manage[["doc_id"]]
counts_manage[["doc_id"]] <- NULL
colSums(counts_manage)

# Word cloud of the management-pattern counts. `max_words` is an argument of
# textplot_wordcloud(), not of dfm_trim(), so it is passed to the plotting
# call; dfm_trim() only drops features that never occur.
# NOTE(review): comparison = TRUE compares documents with each other, which
# presumably needs a small number of (grouped) documents - confirm for this data.
textplot_wordcloud(
  dfm_trim(dfm_results_manage, min_termfreq = 1),
  max_words = 40,
  comparison = TRUE,
  color = c("darkgrey", "black")
)

